# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import redis,re

class BooklistPipeline(object):
    '''Pipeline that stores book detail-page URLs in Redis, deduplicating by book id.'''

    # Matches Douban book detail URLs and captures the numeric book id.
    # Dots are escaped so an unrelated host cannot accidentally match.
    BOOK_ID_RE = re.compile(r'book\.douban\.com/subject/([0-9]+)/')

    def __init__(self, host, port, db):
        '''Open the Redis connection used for the dedup set and URL queues.

        :param host: Redis server hostname (from setting REDIS_HOST)
        :param port: Redis server port (from setting REDIS_PORT)
        :param db: Redis database number (from setting REDIS_DB)
        '''
        # decode_responses=True makes redis-py return str instead of bytes.
        self.r = redis.Redis(host=host, port=port, db=db, decode_responses=True)

    @classmethod
    def from_crawler(cls, crawler):
        '''Scrapy factory hook: build the pipeline from crawler settings.'''
        return cls(
            host=crawler.settings.get("REDIS_HOST"),
            port=crawler.settings.get("REDIS_PORT"),
            db=crawler.settings.get("REDIS_DB"),
        )

    def process_item(self, item, spider):
        '''Route the item's URL into Redis, deduplicating on the book id.

        A URL that looks like a Douban book detail page is pushed to
        'BookSpider:start_urls' the first time its book id is seen; any
        other URL goes to 'BookSpider:no_urls'. The item is returned so
        pipelines later in ITEM_PIPELINES still receive it.
        '''
        match = self.BOOK_ID_RE.search(item['url'])
        if match:
            # sadd returns 1 only when the id was not already in the set,
            # so each book id is queued at most once.
            if self.r.sadd('BookSpider:book_ids', match.group(1)):
                self.r.lpush('BookSpider:start_urls', item['url'])
        else:
            self.r.lpush('BookSpider:no_urls', item['url'])
        # Scrapy pipeline contract: pass the item on to later pipelines.
        return item
